Lets start with LASSO

bwt_df = 
  read_csv("./data/extra_topic_data/birthweight.csv") %>% 
  janitor::clean_names() %>%
  mutate(
    babysex = as.factor(babysex),
    babysex = fct_recode(babysex, "male" = "1", "female" = "2"),
    frace = as.factor(frace),
    frace = fct_recode(frace, "white" = "1", "black" = "2", "asian" = "3", 
                       "puerto rican" = "4", "other" = "8"),
    malform = as.logical(malform),
    mrace = as.factor(mrace),
    mrace = fct_recode(mrace, "white" = "1", "black" = "2", "asian" = "3", 
                       "puerto rican" = "4")) %>% 
  sample_n(200)
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.

To use the lasso, we will use glmnet

y = bwt_df$bwt

x = model.matrix(bwt ~ ., bwt_df)[,-1]
lasso_fit = glmnet(x, y)

Make sure you set those lambdas to make sure its the same

lambda = 10^(seq(3, -2, -0.1))

lasso_fit =
  glmnet(x, y, lambda = lambda)

lasso_cv =
  cv.glmnet(x, y, lambda = lambda)

lambda_opt = lasso_cv$lambda.min

Still going to use the broom packeage on the lasso fit because it will look better

broom::tidy(lasso_fit) %>% 
  select(term, lambda, estimate) %>% 
  complete(term, lambda, fill = list(estimate = 0) ) 
## # A tibble: 1,020 x 3
##    term        lambda estimate
##    <chr>        <dbl>    <dbl>
##  1 (Intercept) 0.01     -4524.
##  2 (Intercept) 0.0126   -4510.
##  3 (Intercept) 0.0158   -4493.
##  4 (Intercept) 0.0200   -4472.
##  5 (Intercept) 0.0251   -4444.
##  6 (Intercept) 0.0316   -4411.
##  7 (Intercept) 0.0398   -4367.
##  8 (Intercept) 0.0501   -4312.
##  9 (Intercept) 0.0631   -4242.
## 10 (Intercept) 0.0794   -4152.
## # … with 1,010 more rows
broom::tidy(lasso_fit) %>% 
  select(term, lambda, estimate) %>% 
  complete(term, lambda, fill = list(estimate = 0) ) %>% 
  filter(term != "(Intercept)") %>% 
  ggplot(aes(x = log(lambda, 10), y = estimate, group = term, color = term)) + 
  geom_path() + 
  geom_vline(xintercept = log(lambda_opt, 10), color = "blue", size = 1.2) +
  theme(legend.position = "none")

This is the cross validation part

broom::tidy(lasso_cv) %>% 
  ggplot(aes(x = log(lambda, 10), y = estimate)) + 
  geom_point() 

The blue line in the first model is the smallest lamnda will be for the set of predictions thats are present from the second graph.

CLustering

poke_df = 
  read_csv("./data/extra_topic_data/pokemon.csv") %>% 
  janitor::clean_names() %>% 
  select(hp, speed)
## Parsed with column specification:
## cols(
##   `#` = col_double(),
##   Name = col_character(),
##   `Type 1` = col_character(),
##   `Type 2` = col_character(),
##   Total = col_double(),
##   HP = col_double(),
##   Attack = col_double(),
##   Defense = col_double(),
##   `Sp. Atk` = col_double(),
##   `Sp. Def` = col_double(),
##   Speed = col_double(),
##   Generation = col_double(),
##   Legendary = col_logical()
## )
poke_df %>% 
  ggplot(aes(x = hp, y = speed)) + 
  geom_point()

We clustersing now

kmeans_fit =
  kmeans(x = poke_df, centers = 3)

We processing and plotting

poke_df =
  broom::augment(kmeans_fit, poke_df)

poke_df %>% 
  ggplot(aes(x = hp, y = speed, color = .cluster)) +
  geom_point()

Clustering Trajectories

traj_data = 
  read_csv("./data/extra_topic_data/trajectories.csv")
## Parsed with column specification:
## cols(
##   subj = col_double(),
##   week = col_double(),
##   value = col_double()
## )
traj_data %>% 
  ggplot(aes(x = week, y = value, group = subj)) + 
  geom_point() + 
  geom_path()

Now we need to get intercepts and slopes for eveyone

int_slope_df = 
  traj_data %>% 
  nest(data = week:value) %>% 
  mutate(
    models = map(data, ~lm(value ~ week, data = .x)),
    result = map(models, broom::tidy)
  ) %>% 
  select(-data, -models) %>% 
  unnest(result) %>% 
  select(subj, term, estimate) %>% 
  pivot_wider(
    names_from = term,
    values_from = estimate
  ) %>% 
  rename(int = "(Intercept)", slope = week)

Try to kmeans this but first we gonna plot the intercept and slope

int_slope_df %>% 
  ggplot(aes(x = int, y = slope)) + 
  geom_point()

Have to do some processing, because it will treat the sunject as something to be clustered but thats not the case

km_fit = 
  kmeans(
    x = int_slope_df %>% 
      select(-subj) %>% 
      scale,  centers = 2)

int_slope_df =
  broom::augment(km_fit, int_slope_df)

We clustered this

int_slope_df %>% 
  ggplot(aes(x = int, y = slope, color = .cluster)) +
  geom_point()

We joined the two together

left_join(traj_data, int_slope_df) %>% 
  ggplot(aes(x = week, y = value, group = subj, color = .cluster)) + 
  geom_point() + 
  geom_path() 
## Joining, by = "subj"